### PPP - Qualtrics Raw Data - Cleaning and Prep for Analysis
rm(list=ls())

### setting wd for inputs
setwd("~/Dropbox/Pollution and Public Perceptions in China/Data and Analysis/Data")

### libraries and packages
pkg <- c("plyr", "dplyr", "tidyr", "vars", "doBy")
lapply(pkg, require, character.only = TRUE)

# Loading the data
p <- read.csv("PPP_final.csv", skip = 1)

# colnames(p)

# pruning Qualtrics-generated columns
p$ResponseID <- NULL
p$ResponseSet <- NULL
p$EndDate <- NULL
p$Finished <- NULL
p$opp <- NULL
p$rid <- NULL
p$RISN <- NULL
p$psid <- NULL
p$med <- NULL
p$PID <- NULL
p$K2 <- NULL
p$gid <- NULL
p$sname <- NULL
p$ac <- NULL
p$sn <- NULL
p$lang <- NULL
p$uid <- NULL
p$id <- NULL
p$List <- NULL
p$term <- NULL
p$Shenyang <- NULL
p$Beijing  <- NULL
p$city <- NULL

# sorting only completes 
# note 1 is complete, 2 is terminate 3 is over quota
p0 <- p[which(p$gc=="1"), ]

## verifying: no one does not consent
colnames(p0)
# summary(p0[ ,4]) 

# removing consent questions
p0[ ,4] <- NULL 
p0[ ,4] <- NULL 

# start date needs to inlcude separate columns
p0$start_date <- as.character(p0$StartDate)
# p0$start_date

# splitting the string 
p1 <- p0 %>% separate(start_date, c("begin_date", "begin_time"), sep = " ", 
                      remove = TRUE, convert = TRUE)

colnames(p1)[colnames(p1)=="您现居住地的邮政编码是."] <- "zip"

# subsetting out shenyang respondents
p1.5 <- p1[nchar(as.character(p1$zip)) == 6,]

p1.5$zip <- as.numeric(as.character(p1.5$zip))

p2 <- p1.5[p1.5$zip<110000,]

### renaming all of the variables
colnames(p2)

colnames(p2)[4:51] <- c("gender", "birth_year", "city", "zip", "hukou",
                        "educ", "occup", "income", "soe", "ccp", "insider", "nghbr_trust",
                        "kids", "kids_age", "married","life_sat", "l_sat", "c_sat",
                        "l_corr", "c_corr", "econ", "ineq", "l_cap", "c_cap", "l_vested",
                        "c_vested", "l_opin", "filter", "c_opin", "l_watch", "c_watch", "dem",
                        "l_uneq", "c_uneq", "china_top","west_top", "anti_west", "renew",
                        "contrib", "l_pollute", "n_pollute","month_pollute", "week_pollute", 
                        "day_pollute","check_air", "l_respons", "c_respons", "air_succ")

# Identify and correct incrorrect birth years - some respondents add extra characters, 
# put in their ages, etc.

#first create a respondent index
p2$id <- 1:nrow(p2)

p2.fix <- p2[nchar(as.character(p2$birth_year)) != 4,]  
p2.fix.vars <- c("id","birth_year")
p2.fix <- p2.fix[p2.fix.vars]
# print(p2.fix)

# Then, manually fix them here
p2$birth_year[p2$id==122] <- 1984
p2$birth_year[p2$id==246] <- 1985
p2$birth_year[p2$id==345] <- 1982
p2$birth_year[p2$id==416] <- 1988
p2$birth_year[p2$id==467] <- 1990
p2$birth_year[p2$id==499] <- 1975
p2$birth_year[p2$id==528] <- 1986
p2$birth_year[p2$id==537] <- 1990
p2$birth_year[p2$id==587] <- 1975
p2$birth_year[p2$id==588] <- 1993
p2$birth_year[p2$id==765] <- 1978
p2$birth_year[p2$id==877] <- 1972
p2$birth_year[p2$id==991] <- 1992
p2$birth_year[p2$id==1038] <- 1991
p2$birth_year[p2$id==1048] <- 1990
p2$birth_year[p2$id==1127] <- 1985
p2$birth_year[p2$id==1385] <- 1985
p2$birth_year[p2$id==1459] <- 1990
p2$birth_year[p2$id==1628] <- 1992
p2$birth_year[p2$id==1689] <- 1981

p2$birth_year <- as.numeric(as.character(p2$birth_year))
p2$age <- 2015 - p2$birth_year
p2$age[p2$age == 635] <- NA 
p2$age <- as.numeric(p2$age)
# summary(p2$age)

#  demographics: examining distribution, missingness and recoding missing values 

# gender
p2$gender[p2$gender==3] <- NA
#p2$gender

# hukou
p2$hukou[p2$hukou==3] <- NA

# inverting hukou to avoid confusion 
p2$hukou[p2$hukou==1] <- 0
p2$hukou[p2$hukou==2] <- 1


# education
p2$educ[p2$educ==9] <- NA
#summary(p2$educ)

# state owned emp
p2$soe[p2$soe==3] <- NA
#summary(p2$soe)

# ccp member
p2$ccp[p2$ccp==3] <- NA
#summary(ccp)

# govt affil
p2$insider[p2$insider==3] <- NA
#summary(p2$insider)

# neighbor trust
p2$nghbr_trust[p2$nghbr_trust==6] <- NA
#summary(p2$nghbr_trust)

# kids
p2$kids[p2$kids==3] <- NA
#summary(p2$kids)

# marital status
p2$married[p2$married==5] <- NA
#summary(p2$married)

# recoding  questions for missing values
p2$life_sat[p2$life_sat == 6] <- NA
p2$l_sat[p2$l_sat == 6] <- NA
# note that the c sat variable is increased by 3!
summary(p2$c_sat) # need to rescale
p2$c_sat <- p2$c_sat-3
p2$c_sat[p2$c_sat == 6] <- NA
p2$l_corr[p2$l_corr==6] <- NA
p2$c_corr[p2$c_corr==6] <- NA
p2$econ[p2$econ==6] <- NA
p2$ineq[p2$ineq==6] <- NA
p2$l_cap[p2$l_cap==6] <- NA
p2$c_cap[p2$ c_cap==6] <- NA
p2$l_vested[p2$l_vested==6] <- NA
p2$c_vested[p2$c_vested==6] <- NA
p2$l_opin[p2$l_opin==6] <- NA
p2$c_opin[p2$c_opin==6] <- NA
p2$l_watch[p2$l_watch==6] <- NA
p2$c_watch[p2$c_watch==6] <- NA
p2$dem[p2$dem==6] <- NA
p2$l_uneq[p2$l_uneq==6] <- NA
p2$c_uneq[p2$c_uneq==6] <- NA
p2$china_top[p2$china_top==6] <- NA
p2$west_top[p2$west_top==6] <- NA
p2$anti_west[p2$anti_west==6] <- NA
p2$renew[p2$renew==6] <- NA
p2$contrib[p2$contrib==6] <- NA
p2$l_pollute[p2$l_pollute==7] <- NA
p2$n_pollute[p2$n_pollute==7] <- NA
p2$month_pollute[p2$month_pollute==7] <- NA
p2$week_pollute[p2$week_pollute==7] <- NA
p2$day_pollute[p2$day_pollute==7] <- NA
p2$check_air[p2$check_air==6] <- NA
p2$l_respons[p2$l_respons==6] <- NA
p2$c_respons[p2$c_respons==6] <- NA
p2$air_succ[p2$air_succ==6] <- NA

# Converting the date variable into numeric values
# Including a dummy variable to denote the parade, holiday periods

p2$date_format <- as.Date(p2$begin_date,format="%Y-%m-%d")
p2$date_format <- as.numeric(p2$date_format)  
p2$date_format <- p2$date_format - 16660 

# now verifying w. a table
date.table <-table(p2$begin_date,p2$date_format)
# date.table 

# Renaming the date_format to day
p2$day <- p2$date_format
p2$date_format <- NULL

#### running the code to look at the distribution of time to complete:
summary(p2$Q_TotalDuration) # note one person took over 9 hours we should drop
hist(p2$Q_TotalDuration, breaks = 50, xlim=c(0, 2000))
time <- (p2$Q_TotalDuration)
duration <- sort(p2$Q_TotalDuration, decreasing = FALSE)
# View(duration)

# removing all observations (n=12) who take the survey in less then 2.5 min (150 secs, 
# =3 sec per question)
p2.a <- p2[p2$Q_TotalDuration>=151, ]

# now splittling the begin_time variable 
p.test <- p2.a %>% separate(begin_time, c("begin_hour", "begin_minute", "begin_second"), sep = ":", remove = TRUE, convert = TRUE)

################################
################################
# Manually Assigning AQI VALUES 
################################
################################

# p.test$day (note differences w/ MEP data..)

p.test$aqi <- NA
p.test$aqi[p.test$day==1] <- 152 # MEP is 119
p.test$aqi[p.test$day==2] <- 73 # MEP is 80
p.test$aqi[p.test$day==3] <- 114  # MEP is 201
p.test$aqi[p.test$day==4] <- 160 # MEP is 145
p.test$aqi[p.test$day==5] <- 124   # MEP 90
p.test$aqi[p.test$day==6] <- 147  # MEP 97
p.test$aqi[p.test$day==7] <- 112   # MEP 100
p.test$aqi[p.test$day==8] <- 56   # MEP is 94
p.test$aqi[p.test$day==9] <- 64 # MEP is 56
p.test$aqi[p.test$day==10] <- 65   # MEP is 53
p.test$aqi[p.test$day==11] <- 58 # MEP is N/A
p.test$aqi[p.test$day==12] <- 58  # MEP is50
p.test$aqi[p.test$day==13] <- 45   # MEP is61
p.test$aqi[p.test$day==14] <- 52  # MEP os 60
p.test$aqi[p.test$day==15] <- 58  # MEP is 70
p.test$aqi[p.test$day==16] <- 82   # MEP is 118
p.test$aqi[p.test$day==17] <- 84   # MEP is 74
p.test$aqi[p.test$day==18] <- 85  # MEP IS 37
p.test$aqi[p.test$day==19] <- 45 # MEP IS 34
p.test$aqi[p.test$day==20] <- 35 
p.test$aqi[p.test$day==21] <- 58
p.test$aqi[p.test$day==22] <- 136
p.test$aqi[p.test$day==23] <- 69
p.test$aqi[p.test$day==24] <- 62
p.test$aqi[p.test$day==25] <- 82
p.test$aqi[p.test$day==26] <- 153
p.test$aqi[p.test$day==27] <- 101
p.test$aqi[p.test$day==28] <- 37
p.test$aqi[p.test$day==29] <- 38
p.test$aqi[p.test$day==30] <- 32
p.test$aqi[p.test$day==31] <- 65
p.test$aqi[p.test$day==32] <- 143
p.test$aqi[p.test$day==33] <- 164
p.test$aqi[p.test$day==34] <- 188
p.test$aqi[p.test$day==35] <- 194
p.test$aqi[p.test$day==36] <- 160
p.test$aqi[p.test$day==37] <- 66
p.test$aqi[p.test$day==38] <- 164
p.test$aqi[p.test$day==39] <- 184
p.test$aqi[p.test$day==40] <- 182
p.test$aqi[p.test$day==41] <- 86
p.test$aqi[p.test$day==42] <- 169
p.test$aqi[p.test$day==43] <- 65
p.test$aqi[p.test$day==44] <- 84
p.test$aqi[p.test$day==45] <- 126
p.test$aqi[p.test$day==46] <- 131
p.test$aqi[p.test$day==47] <- 117
p.test$aqi[p.test$day==48] <- 73
p.test$aqi[p.test$day==49] <- 43
p.test$aqi[p.test$day==50] <- 74
p.test$aqi[p.test$day==51] <- 74
p.test$aqi[p.test$day==52] <- 179
p.test$aqi[p.test$day==53] <- 253
p.test$aqi[p.test$day==54] <- 318
p.test$aqi[p.test$day==55] <- 270
p.test$aqi[p.test$day==56] <- 37
p.test$aqi[p.test$day==57] <- 62

# Creating control variables for the parade, national holidays, TJ explosion, and all holidays
# parade
p.test$parade <- ifelse(p.test$day < 7|p.test$day > 21, 0, 1)

# national day holidays (October 1st to 7th)
p.test$national <- ifelse(p.test$day < 49 | p.test$day > 55, 0, 1)
p.test$StartDate[p.test$national == 1] # to confirm

## dummy for TIANJIN PERIOD
p.test$tj <- ifelse(p.test$day < 1 |p.test$day > 6, 0, 1)
p.test$StartDate[p.test$tj == 1] # to confirm

## dummy for holidays other than national day holiday week
p2$holiday <- ifelse(p2$day == 2|p2$day == 3|p2$day == 9|p2$day == 10|p2$day == 16|p2$day == 17|p2$day == 21|
                       p2$day == 22|p2$day == 23|p2$day == 30|p2$day == 31|p2$day == 37
                     |p2$day == 38|p2$day == 44|p2$day == 45, 1, 0)
# to confirm: 
p2$StartDate[p2$holiday == 1]
p2$StartDate[p2$national == 1]

# standardize aqi to a value with mean 0 and standard deviation 1
p.test$aqi_sd <- (p.test$aqi-mean(p.test$aqi))/(sd(p.test$aqi))
sd(p.test$aqi_sd)

# Getting income and age right
p.test$income[p.test$income >= 2000000] <- NA
p.test$age <- as.numeric(as.character(p.test$age))

##### Changing Dichotomous Variables from 1,2 to 0,1

# gender 0 is now male, 1 is female
p.test$gender[p.test$gender==2] <- 0

# hukou 1 is rural, 0 is urban
p.test$hukou[p.test$hukou==2] <- 0

# soe 0 is not, 1 is yes
p.test$soe[p.test$soe==2] <- 2

# ccp 0 is not, 1 is yes
p.test$ccp[p.test$ccp==2] <- 0

# insider 0 is not, 1 is yes
p.test$insider[p.test$insider==2] <- 0

# kids 0 is no, 1 is yes
p.test$kids[p.test$kids==2] <- 0

# marital status - collapsing, 0 is not and 1 is married
p.test$married[p.test$married==2] <- 0
p.test$married[p.test$married==3] <- 0
p.test$married[p.test$married==4] <- 0

# creating an AQI categorical block based on color - subjective perceptions and thresholds
p.test$aqi.color <- NA
p.test$aqi.color[p.test$aqi<50] <- 1
p.test$aqi.color[p.test$aqi>49 & p.test$aqi<100] <- 2
p.test$aqi.color[p.test$aqi>99 & p.test$aqi<150] <- 3
p.test$aqi.color[p.test$aqi>149 & p.test$aqi<200] <- 4
p.test$aqi.color[p.test$aqi>199 & p.test$aqi<250] <- 5
p.test$aqi.color[p.test$aqi>249 & p.test$aqi<300] <- 6
p.test$aqi.color[p.test$aqi>299 & p.test$aqi<350] <- 7 
# table(p.test$aqi.color)

## creating standardized variables for income, age, education
p.test$educ_sd <- scale(p.test$educ, center = TRUE, scale = TRUE)
p.test$income_sd <- scale(p.test$income, center = TRUE, scale = TRUE)
p.test$age_sd <- scale(p.test$age, center = TRUE, scale = TRUE)

### renaming 
p2 <- p.test

############## ############## ############## 
### Now Creating Lags
############## ############## ############## 

# collapsing the data to determine lag length
collapse1 <- summaryBy(l_sat +  econ  + l_watch +	c_watch	+	anti_west	+ l_pollute +	n_pollute	+ month_pollute	+ week_pollute + day_pollute + aqi +parade + national ~ day, FUN=mean, na.rm=TRUE, data=p2)
collapse1 <- data.frame(collapse1)

# Determining lag length
acf(collapse1$aqi.mean) # simple acf shows us that that it is  AR1, AR2 to worry about

# Additionally, The following code runs AIC, HQ, SC, and FPE tests for autocorrelation. 
# Before doing so, let's check for whether there exists
# any linear time trend or a constant term. 
# Such information is required for conducting the tests. 

# Checking for a linear trend
summary(lm(aqi.mean ~ day + parade.mean + national.mean, data = collapse1)) # no linear trend
# The conclusion is that there is no linear time trend within the survey period that affects AQI,
# but a constant seems to exist (indicated by the signficant intercept in the linear trend model)

# Now, check for autocorrelation using VARselect
ex <- as.matrix(cbind(collapse1$parade.mean, collapse1$national.mean)) # matrix for parade as an exgenous value
ex <- as.data.frame(ex)
VARselect(collapse1$aqi.mean, lag.max = 10, 
          type = "cons", exogen = ex,
          season = NULL)
# All suggest optimal lag length of 2 except for SC. 

# order the data by day
p2 <- p2[order(p2$day), ]

# THE LAGGED LOOP
start <- 1 # setting up a "starting value" to facilitate the value generating scheme
span <- rep(NA, length(unique(p2$day))) # creating a span, assigning NAs to it
p2$aqi.lagged <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p2$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p2[p2$day == i,]) + start - 1 # today
  span.i2 <- nrow(p2[p2$day == i+1,]) + span.i # tomorrow
 
  value.to.be.copied <- unique(p2$aqi[start:span.i]) # from the starting value to the end of a day
  p2$aqi.lagged[(span.i+1):span.i2] <- value.to.be.copied # from the starting value of each day to the next
  start <- span.i + 1 # updating the starting value
} # we ignore all the error messages -- will manualy add lag value for 1 for day 1


# Creating a variable for AQIlagged2
start <- 1 # setting up a "starting value" to facilitate the value generating scheme
span <- rep(NA, length(unique(p2$day))) # creating a span, assigning NAs to it
p2$aqi.lagged2 <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p2$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p2[p2$day == i,]) + start - 1 # today
  span.i2 <- nrow(p2[p2$day == i+1,]) + span.i # tomorrow
  
  value.to.be.copied <- unique(p2$aqi.lagged[start:span.i]) # from the starting value to the end of a day
  p2$aqi.lagged2[(span.i+1):span.i2] <- value.to.be.copied # from the starting value of each day to the next
  
  start <- span.i + 1 # updating the starting value
} # we ignore all the error messages 


# Creating a variable for AQIlagged2
start <- 1 # setting up a "starting value" to facilitate the value generating scheme
span <- rep(NA, length(unique(p2$day))) # creating a span, assigning NAs to it
p2$aqi.lagged3 <- NA # creating the lagged aqi variable, initially assigning NAs to all of the entries
for (i in 1:length(unique(p2$day))) { # loop over every day at the aggregated level
  span.i <- nrow(p2[p2$day == i,]) + start - 1 # today
  span.i2 <- nrow(p2[p2$day == i+1,]) + span.i # tomorrow
  
  value.to.be.copied <- unique(p2$aqi.lagged2[start:span.i]) # from the starting value to the end of a day
  p2$aqi.lagged3[(span.i+1):span.i2] <- value.to.be.copied # from the starting value of each day to the next
  
  start <- span.i + 1 # updating the starting value
} # we ignore all the error messages 

# dropping aqi_color
p2$aqi_color <- NULL

###### MANUALLY INSERT VALUES Laggeed 1 and 2 FOR DAY 1 and Lagged 2 for DAY 2

# aqi.lagged for day1
p2$aqi.lagged[p2$day==1] <- 179
p2$aqi.lagged2[p2$day==1] <- 165
p2$aqi.lagged2[p2$day==2] <- 179

p2$aqi.lagged3[p2$day==2] <- 165
p2$aqi.lagged3[p2$day==3] <- 179


# RENAMING ORIGINAL AND STANDARDIZED variables 
p2$aqio <- p2$aqi # keeping the original
p2$aqio.lagged <- p2$aqi.lagged
p2$aqio.lagged2 <- p2$aqi.lagged2
p2$aqio.lagged3 <- p2$aqi.lagged3
p2$aqi <- p2$aqi_sd
p2$ageo <- p2$age # keeping the original
p2$age <- p2$age_sd
p2$incomeo <- p2$income # kepping the original
p2$income <- p2$income_sd
p2$educo <- p2$educ # keeping the original
p2$educ <- p2$educ_sd

# Standardizing AQI

p2$aqi.lagged <- (p2$aqi.lagged-mean(p2$aqi.lagged))/(sd(p2$aqi.lagged))
sd(p2$aqi.lagged)

p2$aqi.lagged2 <- (p2$aqi.lagged2-mean(p2$aqi.lagged2))/(sd(p2$aqi.lagged2))
sd(p2$aqi.lagged2)


# Creating a factor variable denoting each week
# Note that in our sample, day 1 (August 14th) is Tuesday, so the week 1 refers to
# day 1, 2, 3, 4, 5, 6. Week 2 begins with day 7, ending on day 13, and so on.
### We do NOT use nested ifelse here to avoid mistakes
p2$week <- NA
for (i in 1:nrow(p2)) {
  if (p2$day[i] <= 3) {
    p2$week[i] <- 1
  } else {
    if (p2$day[i] <=10) {
      p2$week[i] <- 2
    } else{
      if(p2$day[i] <= 17) {
        p2$week[i] <- 3
      } else{
        if(p2$day[i] <= 24) {
          p2$week[i] <- 4
        } else {
          if (p2$day[i] <= 31) {
            p2$week[i] <- 5
          } else {
            if(p2$day[i] <= 38) {
              p2$week[i] <- 6
            } else {
              if (p2$day[i] <= 45) {
                p2$week[i] <- 7
              } else {
                if (p2$day[i] <= 52) {
                  p2$week[i] <- 8
                }
                else{
                  if(p2$day[i] <= 59) {
                    p2$week[i] <- 9
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}


##############################
##############################
## Variable direction recoding for readability
##############################
##############################

p2$l_sat_NEW <- NA
p2$l_sat_NEW[p2$l_sat==1] <- 5
p2$l_sat_NEW[p2$l_sat==2] <- 4
p2$l_sat_NEW[p2$l_sat==3] <- 3
p2$l_sat_NEW[p2$l_sat==4] <- 2
p2$l_sat_NEW[p2$l_sat==5] <- 1
p2$l_sat <- p2$l_sat_NEW

p2$c_sat_NEW <- NA
p2$c_sat_NEW[p2$c_sat==1] <- 5
p2$c_sat_NEW[p2$c_sat==2] <- 4
p2$c_sat_NEW[p2$c_sat==3] <- 3
p2$c_sat_NEW[p2$c_sat==4] <- 2
p2$c_sat_NEW[p2$c_sat==5] <- 1
p2$c_sat <- p2$c_sat_NEW

p2$econ_NEW <- NA
p2$econ_NEW[p2$econ==1] <- 5
p2$econ_NEW[p2$econ==2] <- 4
p2$econ_NEW[p2$econ==3] <- 3
p2$econ_NEW[p2$econ==4] <- 2
p2$econ_NEW[p2$econ==5] <- 1
p2$econ <- p2$econ_NEW

p2$anti_west_NEW <- NA
p2$anti_west_NEW[p2$anti_west==1] <- 5
p2$anti_west_NEW[p2$anti_west==2] <- 4
p2$anti_west_NEW[p2$anti_west==3] <- 3
p2$anti_west_NEW[p2$anti_west==4] <- 2
p2$anti_west_NEW[p2$anti_west==5] <- 1
p2$anti_west <- p2$anti_west_NEW

p2$life_sat_NEW <- NA
p2$life_sat_NEW[p2$life_sat==1] <- 5
p2$life_sat_NEW[p2$life_sat==2] <- 4
p2$life_sat_NEW[p2$life_sat==3] <- 3
p2$life_sat_NEW[p2$life_sat==4] <- 2
p2$life_sat_NEW[p2$life_sat==5] <- 1
p2$life_sat <- p2$life_sat_NEW

# getting rid of the extra vars
p2$l_sat_NEW <- NULL 
p2$c_sat_NEW <- NULL 
p2$econ_NEW <- NULL 
p2$anti_west_NEW <- NULL 
p2$life_sat_NEW <- NULL 

p2$educ <- as.numeric(p2$educ)
p2$age <- as.numeric(p2$age)
p2$income <- as.numeric(p2$income)

setwd("~/Dropbox/Pollution and Public Perceptions in China/Data and Analysis/Data")

save(p2,file = "ppp_cleaned")